//************************************ bs::framework - Copyright 2018 Marko Pintera **************************************//
//*********** Licensed under the MIT license. See LICENSE.md for full terms. This notice is not to be removed. ***********//
#pragma once

#include "BsRenderBeastPrerequisites.h"
#include "Utility/BsModule.h"
#include "Renderer/BsParamBlocks.h"
#include "Renderer/BsRendererMaterial.h"
#include "RenderAPI/BsGpuPipelineParamInfo.h"

namespace bs { namespace ct
{
	/** @addtogroup RenderBeast
	 *  @{
	 */

	BS_PARAM_BLOCK_BEGIN(RadixSortParamsDef)
		BS_PARAM_BLOCK_ENTRY(int, gBitOffset)
		BS_PARAM_BLOCK_ENTRY(int, gTilesPerGroup)
		BS_PARAM_BLOCK_ENTRY(int, gNumGroups)
		BS_PARAM_BLOCK_ENTRY(int, gNumExtraTiles)
		BS_PARAM_BLOCK_ENTRY(int, gNumExtraKeys)
	BS_PARAM_BLOCK_END

	extern RadixSortParamsDef gRadixSortParamsDef;
	
	/** Buffers to contain both inputs and outputs for the GpuSort algorithm. */
	struct GpuSortBuffers
	{
		SPtr<GpuBuffer> keys[2];
		SPtr<GpuBuffer> values[2];
	};

	/** Zeroes out the provided buffer. Should be called on the count output buffer before executing RadixSortCountMat. */
	class RadixSortClearMat : public RendererMaterial<RadixSortClearMat>
	{
		RMAT_DEF_CUSTOMIZED("RadixSortClear.bsl");

	public:
		RadixSortClearMat();

		/**
		 * Executes the material, running the compute kernel.
		 *
		 * @param[in]	outputCounts	Pre-allocated buffer to zero out.
		 */
		void execute(const SPtr<GpuBuffer>& outputCounts);

		GpuParamBuffer mOutputParam;
	};

	/**
	 * Counts the occurrences of each digit in the input key buffer. Keys are separated into chunks per group and generated
	 * counts only account for occurrences within that specific group. The resulting count can be provided to
	 * RadixSortPrefixScanMat to generate per-digit offsets.
	 */
	class RadixSortCountMat : public RendererMaterial<RadixSortCountMat>
	{
		RMAT_DEF_CUSTOMIZED("RadixSortCount.bsl");

	public:
		RadixSortCountMat();

		/**
		 * Executes the material, running the compute kernel.
		 *
		 * @param[in]	numGroups		Number of groups to separate the key inputs in.
		 * @param[in]	params			Allocated and initialized parameter buffer using RadixSortParamsDef definition.
		 * @param[in]	inputKeys		A set of keys to sort. @p params buffer needs to be initialized with data
		 *								representing this buffer.
		 * @param[in]	outputCounts	Pre-allocated buffer to contain the resulting per-digit counts for each group.
		 */
		void execute(UINT32 numGroups, const SPtr<GpuParamBlockBuffer>& params, const SPtr<GpuBuffer>& inputKeys,
			const SPtr<GpuBuffer>& outputCounts);

		GpuParamBuffer mInputKeysParam;
		GpuParamBuffer mOutputCountsParam;
	};

	/**
	 * Calculates the offsets required for sorting a set of keys. The input is expected to be generated by
	 * RadixSortCountMat. The offsets are generated per-group and let RadixSortReorderMat know at which global location to
	 * place the keys in that group.
	 */
	class RadixSortPrefixScanMat : public RendererMaterial<RadixSortPrefixScanMat>
	{
		RMAT_DEF_CUSTOMIZED("RadixSortPrefixScan.bsl");

	public:
		RadixSortPrefixScanMat();

		/*
		 * Executes the material, running the compute kernel.
		 *
		 * @param[in]	params			Allocated and initialized parameter buffer using RadixSortParamsDef definition.
		 * @param[in]	inputCounts		Counts as output by the RadixSortCountMat material.
		 * @param[in]	outputCounts	Pre-allocated buffer to contain the resulting per-digit counts for each group.
		 */
		void execute(const SPtr<GpuParamBlockBuffer>& params, const SPtr<GpuBuffer>& inputCounts,
			const SPtr<GpuBuffer>& outputOffsets);

		GpuParamBuffer mInputCountsParam;
		GpuParamBuffer mOutputOffsetsParam;
	};

	/**
	 * Applies offsets calculated by RadixSortPrefixScanMat in order to reorder a set of keys and/or values. The reordered
	 * values are output into a second set of key/value buffers.
	 */
	class RadixSortReorderMat : public RendererMaterial<RadixSortReorderMat>
	{
		RMAT_DEF_CUSTOMIZED("RadixSortReorder.bsl");

	public:
		RadixSortReorderMat();

		/*
		 * Executes the material, running the compute kernel.
		 *
		 * @param[in]	numGroups		Number of groups to separate the key inputs in.
		 * @param[in]	params			Allocated and initialized parameter buffer using RadixSortParamsDef definition.
		 * @param[in]	inputOffsets	Offsets as output by the RadixSortPrefixScanMat material.
		 * @param[in]	buffers			A set of input and output buffers. Key buffers are required, while value buffers
		 *								are optional.
		 * @param[in]	inputBufferIdx	Determines which of the two key/value buffers are considered to be inputs. The other
		 *								buffers are considered outputs and will contain the output data.
		 * @return						Index of the buffer that should be used as input for the next iteration of the
		 *								algorithm. (In case the keys were only partially sorted)
		 */
		void execute(UINT32 numGroups, const SPtr<GpuParamBlockBuffer>& params, const SPtr<GpuBuffer>& inputOffsets,
			const GpuSortBuffers& buffers, UINT32 inputBufferIdx);

		GpuParamBuffer mInputOffsetsBufferParam;
		GpuParamBuffer mInputKeysBufferParam;
		GpuParamBuffer mInputValuesBufferParam;
		GpuParamBuffer mOutputKeysBufferParam;
		GpuParamBuffer mOutputValuesBufferParam;
	};

	/**
	 * Performs a parallel radix sort using the GPU. Best used for sorting large amounts of data (5000+). (CPU is likely
	 * to be more performant for smaller sets).
	 */
	class GpuSort : public Module<GpuSort>
	{
	public:
		GpuSort();

		/**
		 * Sorts the input keys and outputs them in the provided buffer. Optionally also moves a set of values along with
		 * the keys.
		 *
		 * @param[in]	buffers		Buffers that contain the keys and values to sort, as well as space to put sorted keys
		 *							and values in. Both key buffers must be provided however value buffers are optional and
		 *							only required if you need to move values along with keys. All buffers must be be
		 *							standard buffers with 32-bit unsigned integer elements, with random writes enabled.
		 *							All buffers must be of the same size. Buffers respecting these rules can be created
		 *							easily be calling createSortBuffers().
		 *							
		 *							Buffers at index 0 will be used as input, and final output will be a buffer at the
		 *							index as returned by this method.
		 * @param[in]	numKeys		Number of keys to sort. This must be smaller or equal to the number of elements in the
		 *							provided buffers.
		 * @param[in]	keyMask		Mask that controls which portion of the keys are relevant for sorting. This serves as
		 *							an optimization as less bits that need to be sorted, the faster can the algorithm run.
		 *							(e.g. if you only need to sort the first 16 bits of the key, provide 0xFFFF as the
		 *							mask).
		 * @return					Index of the buffer in @p buffers that will contain the final sorted keys and/or values.
		 */
		UINT32 sort(const GpuSortBuffers& buffers, UINT32 numKeys, UINT32 keyMask = 0xFFFFFFFF);

		/**
		 * Creates a set of buffers buffer that can be used when calling the sort() method.
		 *
		 * @param[in]	numElements		Number of elements the buffers should contain.
		 * @param[in]	values			True if the value buffers should be created. If false only key buffers will be
		 *								created.
		 * @return						A set of buffers to store unsorted keys (and optionally values), as well as space
		 *								for the resulting sorted keys (and optionally values).
		 */
		static GpuSortBuffers createSortBuffers(UINT32 numElements, bool values = false);

	private:
		SPtr<GpuBuffer> mHelperBuffers[2];
	};

	/* @} */
}}
